#!/usr/bin/env python
#coding=utf8
'''
Scrape news pages.
'''
import os
import re
import urllib
# urlopen lives in urllib.request on Python 3 and in urllib on Python 2;
# import whichever is available so the scrapers run on both.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

import chardet
# Entertainment news
def yule(url):
    '''
    Download the entertainment page and map headline titles to article links.

    Args:
        url: Address of the page to fetch. The body is decoded as UTF-8.

    Returns:
        dict mapping each title to a link, built by pairing the first six
        title matches with the first six link matches in document order.
        When the page yields fewer than six of either, only the pairs that
        exist are returned instead of raising IndexError.
    '''
    response = urlopen(url)
    try:
        page = response.read().decode('utf-8')
    finally:
        # Always release the connection, even if reading/decoding fails.
        response.close()

    titles = re.findall(r'class="margin_0">(.*?)</a>', page)
    links = re.findall(r'a target="_blank" href="(.*?html)" class=', page)
    # zip stops at the shorter list, so short pages cannot index out of range.
    return dict(zip(titles[:6], links[:6]))


####################################################33
# Society news
def shehui(url):
    '''
    Download the society news page and extract article links and headlines.

    Args:
        url: Address of the page to fetch. The body is decoded as UTF-8.

    Returns:
        A (links, titles) tuple of two lists holding the regex matches in
        document order. The two lists are extracted independently, so their
        lengths are not guaranteed to be equal.
    '''
    response = urlopen(url)
    try:
        page = response.read().decode('utf-8')
    finally:
        # Always release the connection, even if reading/decoding fails.
        response.close()

    links = re.findall(r'<a target="_blank" href="(.*?shtml)" class="npt">', page)
    titles = re.findall(r'target=".*?" suda-uatrack=".*?">(.*?)</a></h2>', page)
    return links, titles

#########################################################33
def junshi(url):  # military news
    '''
    Download the military news page and map headline titles to links.

    Args:
        url: Address of the page to fetch. The body is decoded as UTF-8.

    Returns:
        dict mapping each title (the alt text of a 638-wide image) to a link,
        pairing the i-th title match with the i-th link match in document
        order. Every title that has a corresponding link is included; the
        previous loop stopped one short and silently dropped the last pair.
    '''
    response = urlopen(url)
    try:
        page = response.read().decode('utf-8')
    finally:
        # Always release the connection, even if reading/decoding fails.
        response.close()

    links = re.findall(r'a href="(.*?shtml)"', page)
    titles = re.findall(r'width="638" alt="(.*?)"/></a>', page)
    # zip stops at the shorter list, so a page with fewer links than titles
    # cannot raise IndexError.
    return dict(zip(titles, links))
	############################
def tiyu(url):  # sports
    '''
    Download the sports page and extract image URLs, titles and links.

    Args:
        url: Address of the page to fetch. The body is decoded as UTF-8.

    Returns:
        A (images, titles, links) tuple of three lists, each truncated to at
        most its first five regex matches in document order. The lists are
        extracted independently, so their lengths may differ.
    '''
    response = urlopen(url)
    try:
        page = response.read().decode('utf-8')
    finally:
        # Always release the connection, even if reading/decoding fails.
        response.close()

    # re.S lets the link pattern span line breaks inside the anchor tag.
    links = re.findall(
        r'<a style="display:block" href="(.*?html)" target="_blank"><img',
        page, re.S)
    # The title must end in a character outside [a-zA-Z0-9_]. This is spelled
    # out instead of \W because \W is ASCII-only on Python 2 but
    # Unicode-aware on Python 3; the explicit class behaves the same on both.
    titles = re.findall(r'alt="(.*?[^a-zA-Z0-9_])" /></a>', page)
    images = re.findall(r'target="_blank"><img src="(.*?jpg)" alt=', page)
    return images[:5], titles[:5], links[:5]
# a=tiyu('http://sports.sina.com.cn/')
# for i in a[1]:
# 	print i,a[1][i]
